In this unit we discuss several approximate Bayesian methods that have been presented in the slides of unit D.2.

The Challenger data

y <- c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0) # Failure indicator (1 = O-ring failure on that flight)
temp <- c(53, 57, 58, 63, 66, 67, 67, 67, 68, 69, 70, 70, 70, 70, 72, 73, 75, 75, 76, 76, 78, 79, 81) # Launch temperature (Fahrenheit)
X <- cbind(1, temp) # Design matrix: intercept column plus temperature

We will employ a relatively vague prior centered at \(0\), namely \(\beta \sim N(0, 100 I_p)\).

# Weakly informative Gaussian prior beta ~ N(b, B)
B <- diag(100, ncol(X)) # Prior covariance matrix: 100 * I_p
b <- rep(0, ncol(X)) # Prior mean (zero vector)

Gold standard (Gibbs sampling)

library(BayesLogit)
logit_Gibbs <- function(R, burn_in, y, X, B, b) {
  # Polya-gamma Gibbs sampler for Bayesian logistic regression
  # (Polson, Scott & Windle style data augmentation).
  #
  # Arguments:
  #   R       - number of posterior draws to keep
  #   burn_in - number of initial draws to discard
  #   y       - binary response vector (0/1)
  #   X       - n x p design matrix
  #   B       - prior covariance matrix of beta
  #   b       - prior mean of beta
  #
  # Returns an R x p matrix whose rows are posterior draws of beta.
  #
  # NOTE: draws are reproducible under set.seed() only if the exact order
  # of RNG calls below (rpg.devroye, then rnorm) is preserved.
  p <- ncol(X)
  n <- nrow(X)
  out <- matrix(0, R, p) # Initialize an empty matrix to store the values

  P <- solve(B) # Prior precision matrix
  Pb <- P %*% b # Precision-weighted prior mean, reused at every iteration

  # kappa = y - 1/2 from the Polya-gamma identity; X'kappa is constant
  Xy <- crossprod(X, y - 1 / 2)

  # Initialization
  beta <- rep(0, p)

  # Iterative procedure
  for (r in 1:(R + burn_in)) {

    # Sampling the Pólya-gamma latent variables: omega_i | beta ~ PG(1, x_i'beta)
    eta <- c(X %*% beta)
    omega <- rpg.devroye(num = n, h = 1, z = eta)

    # Sampling beta | omega ~ N(mu, Sigma) with
    #   Sigma = (X' diag(omega) X + P)^{-1},  mu = Sigma (X'kappa + P b).
    # The eigendecomposition of the posterior precision gives both Sigma
    # and a square root used to draw the Gaussian below.
    eig <- eigen(crossprod(X * sqrt(omega)) + P, symmetric = TRUE)

    # Sigma = V diag(1/lambda) V' computed from the eigenpairs
    Sigma <- crossprod(t(eig$vectors) / sqrt(eig$values))
    mu <- Sigma %*% (Xy + Pb)

    # A1 satisfies A1'A1 = Sigma, so z %*% A1 with z ~ N(0, I) has covariance Sigma
    A1 <- t(eig$vectors) / sqrt(eig$values)
    beta <- mu + c(matrix(rnorm(1 * p), 1, p) %*% A1)

    # Store the values after the burn-in period
    if (r > burn_in) {
      out[r - burn_in, ] <- beta
    }
  }
  out
}
set.seed(123) # Fix the RNG so the MCMC draws are reproducible

# Running the MCMC; this long chain serves as the reference ("gold standard")
# against which the approximate methods below are compared
fit_MCMC <- logit_Gibbs(R = 50000, burn_in = 5000, y, X, B, b) # MCMC (gold standard)

Laplace approximation

logit_Laplace <- function(y, X, B, b, tol = 1e-16, maxiter = 10000) {
  # Laplace approximation for Bayesian logistic regression.
  #
  # The posterior mode is located with an MM algorithm based on the
  # Jaakkola-Jordan quadratic bound; the approximation is then the Gaussian
  # with mean at the mode and covariance equal to the inverse negative
  # Hessian of the log-posterior evaluated there.
  #
  # Arguments:
  #   y, X    - binary response and design matrix
  #   B, b    - prior covariance and prior mean of beta
  #   tol     - stop when the log-posterior gain falls below this value
  #   maxiter - maximum number of MM iterations
  #
  # Returns a list with mu (mode), Sigma (Gaussian covariance) and a
  # Convergence matrix tracing the log-posterior; errors out if maxiter
  # iterations are not enough.
  P <- solve(B) # Prior precision matrix
  Pb <- P %*% b # Precision-weighted prior mean
  Xy <- crossprod(X, y - 0.5)

  # Unnormalized log-posterior at beta, with eta = X beta precomputed.
  log_post <- function(beta, eta) {
    sum(y * eta - log(1 + exp(eta))) - 0.5 * t(beta) %*% P %*% beta
  }

  # MM weights omega(eta) = tanh(eta/2) / (2 eta), with limit 1/4 at eta = 0
  # (the 0/0 case shows up as NaN and is patched to the limit).
  mm_weights <- function(eta) {
    w <- tanh(eta / 2) / (2 * eta)
    w[is.nan(w)] <- 0.25
    w
  }

  trace_lp <- numeric(maxiter)

  # Initialization: one step with all weights fixed at 1/4
  beta <- solve(crossprod(X / 4, X) + P, Xy + Pb)
  eta <- c(X %*% beta)
  w <- mm_weights(eta)
  trace_lp[1] <- log_post(beta, eta)

  # Iterative procedure: refit the weighted ridge system until the
  # log-posterior stops increasing by more than tol
  for (iter in 2:maxiter) {
    beta <- solve(qr(crossprod(X * w, X) + P), Xy + Pb)
    eta <- c(X %*% beta)
    w <- mm_weights(eta)
    trace_lp[iter] <- log_post(beta, eta)

    if (trace_lp[iter] - trace_lp[iter - 1] < tol) {
      # Exact logistic Hessian weights p(1 - p) at the mode for Sigma
      prob <- plogis(eta)
      return(list(
        mu = c(beta),
        Sigma = solve(crossprod(X * prob * (1 - prob), X) + P),
        Convergence = cbind(Iteration = (1:iter) - 1, logpost = trace_lp[1:iter])
      ))
    }
  }
  stop("The algorithm has not reached convergence")
}
library(tictoc)
library(ggplot2)

# Time the Laplace approximation (deterministic, so no seed needed)
tic()
fit_Laplace <- logit_Laplace(y, X, B, b)
toc()
0.035 sec elapsed

par(mfrow = c(1, 2)) # Two panels side by side

# beta[1]: MCMC kernel density estimate (dotted) vs Laplace normal approximation
plot(density(fit_MCMC[, 1]), xlab = expression(beta[1]), lty = "dotted", main = "")
curve(dnorm(x, fit_Laplace$mu[1], sqrt(fit_Laplace$Sigma[1, 1])), add = TRUE)

# beta[2]: same comparison for the temperature coefficient
plot(density(fit_MCMC[, 2]), xlab = expression(beta[2]), lty = "dotted", main = "")
curve(dnorm(x, fit_Laplace$mu[2], sqrt(fit_Laplace$Sigma[2, 2])), add = TRUE)

Variational Bayes

logit_CAVI <- function(y, X, B, b, tol = 1e-16, maxiter = 10000) {
  # Coordinate-ascent variational inference (CAVI) for Bayesian logistic
  # regression, based on the Jaakkola-Jordan variational bound.
  #
  # Arguments:
  #   y, X    - binary response and design matrix
  #   B, b    - prior covariance and prior mean of beta
  #   tol     - stop when the ELBO change falls below this value
  #   maxiter - maximum number of CAVI iterations
  #
  # Returns a list with the Gaussian variational parameters mu and Sigma,
  # a Convergence matrix tracing the lower bound, and the variational
  # parameters xi; errors out if maxiter iterations are not enough.

  # Log-determinant of a matrix (or log of a scalar)
  log_det <- function(M) {
    if (!is.matrix(M)) {
      return(log(M))
    }
    determinant(M, logarithm = TRUE)$modulus
  }

  p <- ncol(X)
  n <- nrow(X)

  P <- solve(B) # Prior precision matrix
  Pb <- c(P %*% b) # Precision-weighted prior mean
  Pdet <- log_det(P)
  Xy <- crossprod(X, y - 0.5) # Constant term of the variational mean update

  # Evidence lower bound (ELBO) for the current variational parameters
  elbo <- function(Sigma_vb, mu_vb, eta, xi) {
    0.5 * p + 0.5 * log_det(Sigma_vb) + 0.5 * Pdet -
      0.5 * t(mu_vb - b) %*% P %*% (mu_vb - b) +
      sum((y - 0.5) * eta + log(plogis(xi)) - 0.5 * xi) -
      0.5 * sum(diag(P %*% Sigma_vb))
  }

  # One CAVI sweep: update (Sigma, mu) given omega, then (xi, omega).
  # The NaN patch handles the 0/0 limit of tanh(xi/2)/(2 xi) at xi = 0.
  cavi_step <- function(omega) {
    Sigma_vb <- solve(crossprod(X * omega, X) + P)
    mu_vb <- Sigma_vb %*% (Xy + Pb)
    eta <- c(X %*% mu_vb)
    xi <- sqrt(eta^2 + rowSums(X %*% Sigma_vb * X))
    w <- tanh(xi / 2) / (2 * xi)
    w[is.nan(w)] <- 0.25
    list(Sigma = Sigma_vb, mu = mu_vb, eta = eta, xi = xi, omega = w)
  }

  lowerbound <- numeric(maxiter)

  # Initialization with all omega equal to 1/4
  state <- cavi_step(rep(1 / 4, n))
  lowerbound[1] <- elbo(state$Sigma, state$mu, state$eta, state$xi)

  # Iterative procedure: sweep until the ELBO stabilizes
  for (iter in 2:maxiter) {
    state <- cavi_step(state$omega)
    lowerbound[iter] <- elbo(state$Sigma, state$mu, state$eta, state$xi)

    if (abs(lowerbound[iter] - lowerbound[iter - 1]) < tol) {
      return(list(
        mu = c(state$mu), Sigma = matrix(state$Sigma, p, p),
        Convergence = cbind(
          Iteration = (1:iter) - 1,
          Lowerbound = lowerbound[1:iter]
        ), xi = state$xi
      ))
    }
  }
  stop("The algorithm has not reached convergence")
}
library(tictoc)

# Time the CAVI run (deterministic, so no seed needed)
tic()
fit_CAVI <- logit_CAVI(y, X, B, b)
toc()
0.059 sec elapsed

par(mfrow = c(1, 2)) # Two panels side by side

# beta[1]: MCMC kernel density estimate (dotted) vs CAVI normal approximation
plot(density(fit_MCMC[, 1]), xlab = expression(beta[1]), lty = "dotted", main = "", ylim = c(0, 0.09))
curve(dnorm(x, fit_CAVI$mu[1], sqrt(fit_CAVI$Sigma[1, 1])), add = TRUE)

# beta[2]: same comparison for the temperature coefficient
plot(density(fit_MCMC[, 2]), xlab = expression(beta[2]), lty = "dotted", main = "", ylim = c(0, 6))
curve(dnorm(x, fit_CAVI$mu[2], sqrt(fit_CAVI$Sigma[2, 2])), add = TRUE)

Expectation propagation

library(EPGLM)

# Expectation propagation; s is the common prior variance B[1, 1]
# (EPlogit assumes an isotropic N(0, s I) prior)
tic()
fit_EP <- EPlogit(X, y, s = B[1, 1])
toc()
0.002 sec elapsed

par(mfrow = c(1, 2)) # Two panels side by side

# beta[1]: MCMC kernel density estimate (dotted) vs EP normal approximation
plot(density(fit_MCMC[, 1]), xlab = expression(beta[1]), lty = "dotted", main = "", ylim = c(0, 0.09))
curve(dnorm(x, fit_EP$m[1], sqrt(fit_EP$V[1, 1])), add = TRUE)

# beta[2]: same comparison for the temperature coefficient
plot(density(fit_MCMC[, 2]), xlab = expression(beta[2]), lty = "dotted", main = "", ylim = c(0, 6))
curve(dnorm(x, fit_EP$m[2], sqrt(fit_EP$V[2, 2])), add = TRUE)

Comparisons

# Posterior means of beta under each method, with MCMC as the reference
Means <- data.frame(
  MCMC = colMeans(fit_MCMC),
  Laplace = fit_Laplace$mu,
  VB = fit_CAVI$mu,
  EP = fit_EP$m
)
knitr::kable(Means, digits = 4)
MCMC Laplace VB EP
11.8050 10.5527 11.0425 11.8669
-0.1858 -0.1665 -0.1740 -0.1869
mean_mcmc <- colMeans(fit_MCMC) # MCMC posterior means, used as the reference
# Signed deviations of each approximation from the MCMC means
Errors <- data.frame(
  Laplace = (fit_Laplace$mu - mean_mcmc),
  VB = (fit_CAVI$mu - mean_mcmc),
  EP = (fit_EP$m - mean_mcmc)
)
# Mean absolute deviation per method
apply(Errors, 2, function(x) mean(abs(x)))
   Laplace         VB         EP 
0.63581238 0.38720446 0.03149474 
# Per-coefficient signed errors
knitr::kable(Errors, digits = 4)
Laplace VB EP
-1.2523 -0.7626 0.0619
0.0193 0.0118 -0.0011
# Relative (percentage) deviations from the MCMC means
Errors_perc <- data.frame(
  Laplace = (fit_Laplace$mu - mean_mcmc) / mean_mcmc * 100,
  VB = (fit_CAVI$mu - mean_mcmc) / mean_mcmc * 100,
  EP = (fit_EP$m - mean_mcmc) / mean_mcmc * 100
)
# Mean absolute percentage error per method
apply(Errors_perc, 2, function(x) mean(abs(x)))
   Laplace         VB         EP 
10.4966927  6.4109456  0.5550979 
# Per-coefficient percentage errors
knitr::kable(Errors_perc, digits = 4)
Laplace VB EP
-10.6084 -6.4598 0.5244
-10.3850 -6.3620 0.5858

Uncertainty quantification

# Posterior standard deviations of beta under each method; VB is expected
# to understate the spread relative to MCMC
Sd <- data.frame(
  MCMC = sqrt(diag(var(fit_MCMC))),
  Laplace = sqrt(diag(fit_Laplace$Sigma)),
  VB = sqrt(diag(fit_CAVI$Sigma)),
  EP = sqrt(diag(fit_EP$V))
)
knitr::kable(Sd, digits = 4, row.names = FALSE)
MCMC Laplace VB EP
5.3675 5.0465 4.3489 5.3215
0.0789 0.0741 0.0628 0.0783